We will use Eurostat’s indicator data for “Gender pay gap in unadjusted form” as a case study when working with the data.table R package and apply a few animations using gganimate. The goal is to explore the geographical and time trends for the gender pay gap in the EU and to compare Portugal with other EU countries.
We will start by learning about and understanding the raw data, which we will then process (“wrangle”) in preparation for some exploratory analysis.
The objective is to look at the geographical and time trends in the data. We will answer the following questions:
% of average gross hourly earnings of men
The indicator measures the difference between average gross hourly earnings of male paid employees and of female paid employees as a percentage of average gross hourly earnings of male paid employees. The indicator has been defined as unadjusted, because it gives an overall picture of gender inequalities in terms of pay and measures a concept which is broader than the concept of equal pay for equal work. All employees working in firms with ten or more employees, without restrictions for age and hours worked, are included.
The indicator measures the difference between average gross hourly earnings of male paid employees and of female paid employees as a percentage of average gross hourly earnings of male paid employees. The indicator has been defined as unadjusted, because it gives an overall picture of gender inequalities in terms of pay and measures a concept which is broader than the concept of equal pay for equal work. All employees working in firms with ten or more employees, without restrictions for age and hours worked, are included.
Overall data coverage: 2002 — 2018 Number of values: 433
% of average gross hourly earnings of men.
ESS (SES) Source of data: Eurostat Data source: Structure of Earnings Survey (SES) Data provider: Eurostat, the statistical office of the European Union, based on data reported by the countries. source link: https://ec.europa.eu/eurostat/databrowser/view/sdg_05_20/ online data code: SDG_05_20
Copyrights: Eurostat Copyright/Licence Policy is applicable.
# Search the Eurostat table of contents for tables whose title matches the
# indicator name.
query <- search_eurostat(
  pattern = "Gender pay gap in unadjusted form",
  type = "table",
  fixed = FALSE
)
# Show only the first two columns (title and code) of the matches.
query[, 1:2]
## # A tibble: 4 x 2
## title code
## <chr> <chr>
## 1 Gender pay gap in unadjusted form sdg_05_20
## 2 Gender pay gap in unadjusted form sdg_05_20
## 3 Gender pay gap in unadjusted form tesem180
## 4 Gender pay gap in unadjusted form sdg_05_20
## # A tibble: 2 x 5
## unit nace_r2 geo time values
## <chr> <chr> <chr> <dbl> <dbl>
## 1 PC B-S_X_O AT 2002 NA
## 2 PC B-S_X_O AT 2006 25.5
# Download the indicator for the geographies listed in `PTandEU` (defined
# outside this excerpt), with the time column returned as a number.
dat1 <- get_eurostat(
  id = "sdg_05_20",
  time_format = "num",
  filters = list(geo = PTandEU)
)
# Peek at the first two rows.
dat1[1:2, ]
## # A tibble: 2 x 5
## unit nace_r2 geo time values
## <chr> <chr> <chr> <dbl> <dbl>
## 1 PC B-S_X_O EU27_2020 2002 NA
## 2 PC B-S_X_O EU27_2020 2006 NA
## # A tibble: 3 x 5
## unit nace_r2 geo time values
## <chr> <chr> <chr> <dbl> <dbl>
## 1 Percent… Industry, construction and services (except publ… Austr… 2002 NA
## 2 Percent… Industry, construction and services (except publ… Austr… 2006 25.5
## 3 Percent… Industry, construction and services (except publ… Austr… 2007 25.5
## # A tibble: 3 x 5
## unit nace_r2 geo time values
## <chr> <chr> <chr> <dbl> <dbl>
## 1 Percent… Industry, construction and services… European Union - 2… 2002 NA
## 2 Percent… Industry, construction and services… European Union - 2… 2006 NA
## 3 Percent… Industry, construction and services… European Union - 2… 2007 NA
(there is no available data for 2019)
# Download the indicator for the countries listed in `ct` (defined outside
# this excerpt). Assumes the default time format, i.e. `time` is a Date.
dat <- get_eurostat(id = "sdg_05_20", filters = list(geo = ct))
library(ggplot2)
library(dplyr)
library(gganimate)

# One label per country, placed at that country's latest observation.
line_labels <- dat %>%
  group_by(geo) %>%
  filter(time == max(time)) %>%
  ungroup()

# Static time-trend plot: one line per country. The title range follows the
# data coverage (2002-2018).
trend_plot <- ggplot(
  dat,
  aes(x = time, y = values, color = geo, label = geo)
) +
  geom_line(alpha = .5) +
  geom_text(data = line_labels, size = 2.6) +
  theme(legend.position = "none") +
  labs(title = "Gender Pay Gap, 2002-2018", x = "Year", y = "%")
trend_plot

# Animated version. The animation layers must be added to the plot object:
# evaluated on their own (as in the original chunk) they only yield NULL.
# NOTE(review): with transition_time() each frame holds a single observation
# per country, so geom_line() may have nothing to connect; confirm whether
# transition_reveal() was intended here.
trend_plot +
  labs(title = 'Year: {frame_time}', x = 'Time', y = 'Percentage') +
  transition_time(time) +
  ease_aes('linear')
# Time trend per country, revealed progressively along the time axis.
# The original chain set legend.position twice, called labs() twice and
# added transition_reveal() twice; only the last setting of each is kept.
ggplot(dat, aes(time, values, color = geo, label = geo)) +
  geom_line(alpha = .5) +
  geom_text(
    data = dat %>% group_by(geo) %>% filter(time == max(time)),
    size = 2.6
  ) +
  geom_point() +
  scale_color_viridis_d() +
  theme(legend.position = "top") +
  labs(title = "Gender Pay Gap, 2002-2018", x = "Year", y = "%") +
  transition_reveal(time)

# Bar chart of the gap by country, animated one time point per frame.
# The per-frame title replaces the former fixed "2019" title, and the unused
# fill label is dropped because no fill aesthetic is mapped.
ggplot(dat, aes(x = reorder(geo, values), y = values)) +
  geom_col(color = "white", fill = "grey80") +
  theme(axis.text.x = element_text(size = 6)) +
  labs(
    title = 'Year: {frame_time}',
    subtitle = "% of average gross hourly earnings of men"
  ) +
  transition_time(time) +
  ease_aes()
Portugal has no available data until 2006 and the EU only has available data from 2010 on (again, there is no available data for 2019)
# Display names of the two geographies being compared; not referenced by the
# code visible in this excerpt.
pteu <- c("European Union - 27 countries (from 2020)", "Portugal")
library(ggplot2)
library(dplyr)
# Time trend for the geographies in `dat1`, labelled at the last time point.
# The title ends in 2018 because the dataset has no values for 2019.
ggplot(
  dat1,
  aes(x = time, y = values, color = geo, label = geo)
) +
  geom_line(alpha = .5) +
  geom_text(
    data = dat1 %>% group_by(geo) %>% filter(time == max(time)),
    size = 2.6
  ) +
  theme(legend.position = "none") +
  labs(title = "Gender Pay Gap, 2006-2018", x = "Year", y = "%")
Now we compare the first and the latest years [2002 vs. 2019] provided in the dataset individually.
# Keep only the 2002 observations. Assumes `time` is a Date column, so the
# comparison is made against an explicit Date.
dat_2002 <- dat %>%
  filter(time == as.Date("2002-01-01"))

# Countries ordered by the size of their gap in 2002.
ggplot(dat_2002, aes(x = reorder(geo, values), y = values)) +
  geom_col(color = "white", fill = "grey80") +
  theme(axis.text.x = element_text(size = 6)) +
  labs(title = "Gender Pay Gap in 2002", y = "%", x = NULL)

# Attach country-level (NUTS 0) geometries to the 2002 values and bin the
# values into four classes for the map legend. The join key is spelled out
# instead of being guessed from the shared column names.
mapdata <- get_eurostat_geospatial(nuts_level = 0) %>%
  right_join(dat_2002, by = "geo") %>%
  mutate(cat = cut_to_classes(values, n = 4, decimals = 1))
head(select(mapdata, geo, values, cat), 3)
## Simple feature collection with 3 features and 3 fields
## geometry type: MULTIPOLYGON
## dimension: XY
## bbox: xmin: 5.95607 ymin: 34.56908 xmax: 34.56859 ymax: 47.80401
## geographic CRS: WGS 84
## geo values cat geometry
## 1 BG 18.9 16.9 ~< 22.3 MULTIPOLYGON (((22.99717 43...
## 2 CH NA No data MULTIPOLYGON (((8.61383 47....
## 3 CY 22.5 22.3 ~< 27.7 MULTIPOLYGON (((33.75237 34...
# Choropleth of the 2002 gap classes, cropped to the given longitude and
# latitude window.
ggplot(mapdata, aes(fill = cat)) +
  scale_fill_brewer(palette = "RdYlBu") +
  geom_sf(color = alpha("white", 1 / 3), alpha = .6) +
  xlim(c(-12, 44)) +
  ylim(c(35, 70)) +
  labs(
    title = "Gender Pay Gap in 2002",
    subtitle = "% of average gross hourly earnings of men",
    fill = "%"
  )
only a few countries have available data for 2002 Portugal only has available data from 2006 on
library(gghighlight)
# Keep only the 2006 observations. Assumes `time` is a Date column, so the
# comparison is made against an explicit Date.
dat_2006 <- dat %>%
  filter(time == as.Date("2006-01-01"))

# Countries ordered by the size of their gap in 2006; only the bar for
# Portugal (geo code "PT") keeps its colour.
ggplot(dat_2006, aes(x = reorder(geo, values), y = values)) +
  geom_col(color = "white", fill = "tomato") +
  gghighlight(geo == "PT") +
  theme(axis.text.x = element_text(size = 6)) +
  labs(title = "Gender Pay Gap in 2006", y = "%", x = NULL)
Portugal is highlighted for analysis and comparison
# Attach country-level (NUTS 0) geometries to the 2006 values and bin the
# values into four classes for the map legend. The join key is spelled out
# instead of being guessed from the shared column names.
mapdata <- get_eurostat_geospatial(nuts_level = 0) %>%
  right_join(dat_2006, by = "geo") %>%
  mutate(cat = cut_to_classes(values, n = 4, decimals = 1))
head(select(mapdata, geo, values, cat), 3)
## Simple feature collection with 3 features and 3 fields
## geometry type: MULTIPOLYGON
## dimension: XY
## bbox: xmin: 5.95607 ymin: 34.56908 xmax: 34.56859 ymax: 47.80401
## geographic CRS: WGS 84
## geo values cat geometry
## 1 BG 12.4 10.8 ~< 17.1 MULTIPOLYGON (((22.99717 43...
## 2 CH 18.6 17.1 ~< 23.4 MULTIPOLYGON (((8.61383 47....
## 3 CY 21.8 17.1 ~< 23.4 MULTIPOLYGON (((33.75237 34...
# Choropleth of the 2006 gap classes, cropped to the given longitude and
# latitude window.
ggplot(mapdata, aes(fill = cat)) +
  scale_fill_brewer(palette = "RdYlBu") +
  geom_sf(color = alpha("white", 1 / 3), alpha = .6) +
  xlim(c(-12, 44)) +
  ylim(c(35, 70)) +
  labs(
    title = "Gender Pay Gap in 2006",
    subtitle = "% of average gross hourly earnings of men",
    fill = "%"
  )
## Gender Pay Gap in 2018
(again, there is no available data for 2019)
# Keep only the 2018 observations. Assumes `time` is a Date column, so the
# comparison is made against an explicit Date.
dat_2018 <- dat %>%
  filter(time == as.Date("2018-01-01"))

# Countries ordered by the size of their gap in 2018; only the bar for
# Portugal (geo code "PT") keeps its colour.
ggplot(dat_2018, aes(x = reorder(geo, values), y = values)) +
  geom_col(color = "white", fill = "tomato") +
  gghighlight(geo == "PT") +
  theme(axis.text.x = element_text(size = 6)) +
  labs(title = "Gender Pay Gap in 2018", y = "%", x = NULL)

# Attach country-level (NUTS 0) geometries to the 2018 values and bin the
# values into four classes for the map legend. The join key is spelled out
# instead of being guessed from the shared column names.
mapdata <- get_eurostat_geospatial(nuts_level = 0) %>%
  right_join(dat_2018, by = "geo") %>%
  mutate(cat = cut_to_classes(values, n = 4, decimals = 1))
head(select(mapdata, geo, values, cat), 3)
## Simple feature collection with 3 features and 3 fields
## geometry type: MULTIPOLYGON
## dimension: XY
## bbox: xmin: 5.95607 ymin: 34.56908 xmax: 34.56859 ymax: 47.80401
## geographic CRS: WGS 84
## geo values cat geometry
## 1 BG 13.9 11.6 ~< 16.7 MULTIPOLYGON (((22.99717 43...
## 2 CH NA No data MULTIPOLYGON (((8.61383 47....
## 3 CY 10.4 6.5 ~< 11.6 MULTIPOLYGON (((33.75237 34...
# Choropleth of the 2018 gap classes, cropped to the given longitude and
# latitude window.
ggplot(data = mapdata, mapping = aes(fill = cat)) +
  geom_sf(color = alpha("white", 1 / 3), alpha = .6) +
  scale_fill_brewer(palette = "RdYlBu") +
  xlim(c(-12, 44)) +
  ylim(c(35, 70)) +
  labs(
    title = "Gender Pay Gap in 2018",
    subtitle = "% of average gross hourly earnings of men",
    fill = "%"
  )